Importing All Necessary Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import nltk
import re
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
Importing Dataset
# Importing dataset
# NOTE(review): absolute local path — consider a relative path or a config value
review = pd.read_csv(r"E:\HASAN\WLU UNIVERSITY\1st Semester (FALL)\CP-640 - Machine Learning\Project\Womens Clothing E-Commerce Reviews.csv")
review.head()  # preview the first 5 rows
| Unnamed: 0 | Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comf... | 4 | 1 | 0 | Initmates | Intimate | Intimates |
| 1 | 1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happene... | 5 | 1 | 4 | General | Dresses | Dresses |
| 2 | 2 | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0 | General | Dresses | Dresses |
| 3 | 3 | 1049 | 50 | My favorite buy! | I love, love, love this jumpsuit. it's fun, fl... | 5 | 1 | 0 | General Petite | Bottoms | Pants |
| 4 | 4 | 847 | 47 | Flattering shirt | This shirt is very flattering to all due to th... | 5 | 1 | 6 | General | Tops | Blouses |
Data Cleaning
# Drop the auto-generated index column ("Unnamed: 0") left over from the CSV export.
unnamed_cols = [col for col in review.columns if col.startswith('Unnamed')]
review = review.drop(columns=unnamed_cols)
review.head()
| Clothing ID | Age | Title | Review Text | Rating | Recommended IND | Positive Feedback Count | Division Name | Department Name | Class Name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 767 | 33 | NaN | Absolutely wonderful - silky and sexy and comf... | 4 | 1 | 0 | Initmates | Intimate | Intimates |
| 1 | 1080 | 34 | NaN | Love this dress! it's sooo pretty. i happene... | 5 | 1 | 4 | General | Dresses | Dresses |
| 2 | 1077 | 60 | Some major design flaws | I had such high hopes for this dress and reall... | 3 | 0 | 0 | General | Dresses | Dresses |
| 3 | 1049 | 50 | My favorite buy! | I love, love, love this jumpsuit. it's fun, fl... | 5 | 1 | 0 | General Petite | Bottoms | Pants |
| 4 | 847 | 47 | Flattering shirt | This shirt is very flattering to all due to th... | 5 | 1 | 6 | General | Tops | Blouses |
# Checking null value
review.isnull().sum()  # per-column count of missing values
Clothing ID 0 Age 0 Title 3810 Review Text 845 Rating 0 Recommended IND 0 Positive Feedback Count 0 Division Name 14 Department Name 14 Class Name 14 dtype: int64
# Deleting null values because it is less than 30% of whole data
# (Title is the worst at ~16% missing; dropping loses ~3.8k of ~23.5k rows)
review = review.dropna(axis=0)
# Checking null value after removing
review.isnull().sum()
Clothing ID 0 Age 0 Title 0 Review Text 0 Rating 0 Recommended IND 0 Positive Feedback Count 0 Division Name 0 Department Name 0 Class Name 0 dtype: int64
Data Exploration
# Shows that there are 19662 rows and 10 columns
review.shape  # (rows, columns) after cleaning
(19662, 10)
# Shows datatype of each column
review.dtypes  # numeric ids/ratings are int64; text columns are object
Clothing ID int64 Age int64 Title object Review Text object Rating int64 Recommended IND int64 Positive Feedback Count int64 Division Name object Department Name object Class Name object dtype: object
# Shows count, mean, standard deviation, minimum, maximum, 25% 50% 75% percentiles
review.describe()  # numeric columns only by default
| Clothing ID | Age | Rating | Recommended IND | Positive Feedback Count | |
|---|---|---|---|---|---|
| count | 19662.000000 | 19662.000000 | 19662.000000 | 19662.000000 | 19662.000000 |
| mean | 921.297274 | 43.260808 | 4.183145 | 0.818177 | 2.652477 |
| std | 200.227528 | 12.258122 | 1.112224 | 0.385708 | 5.834285 |
| min | 1.000000 | 18.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 861.000000 | 34.000000 | 4.000000 | 1.000000 | 0.000000 |
| 50% | 936.000000 | 41.000000 | 5.000000 | 1.000000 | 1.000000 |
| 75% | 1078.000000 | 52.000000 | 5.000000 | 1.000000 | 3.000000 |
| max | 1205.000000 | 99.000000 | 5.000000 | 1.000000 | 122.000000 |
review.describe(include=object)  # count/unique/top/freq for the categorical and text columns
| Title | Review Text | Division Name | Department Name | Class Name | |
|---|---|---|---|---|---|
| count | 19662 | 19662 | 19662 | 19662 | 19662 |
| unique | 13983 | 19656 | 3 | 6 | 20 |
| top | Love it! | Perfect fit and i've gotten so many compliment... | General | Tops | Dresses |
| freq | 136 | 3 | 11664 | 8713 | 5371 |
print(review.groupby('Age').size())  # number of reviews per reviewer age
Age
18 4
19 28
20 91
21 75
22 106
...
91 3
92 1
93 2
94 1
99 1
Length: 77, dtype: int64
# Histogram of reviewer ages
sns.histplot(review['Age'])
plt.title('Count of Age of Reviewer')
plt.show()
# Counts of not-recommended (0) vs recommended (1) reviews
ri = review.groupby('Recommended IND').size()
print(ri)
Recommended IND 0 3575 1 16087 dtype: int64
# Explicit labels in index order (0 first, then 1), so slices and labels line up
ri_labels = ['Not Recommended', 'Recommended']
plt.pie(ri, labels=ri_labels, autopct='%.1f%%')
plt.title('Number of Positive and Negative Reviews')
plt.legend()
plt.show()
print(review.groupby('Division Name').size())  # review counts per store division
Division Name General 11664 General Petite 6778 Initmates 1220 dtype: int64
# Bar chart of review counts per division
sns.countplot(x = review['Division Name'])
plt.title('Number of Different Divisions')
plt.show()
# Review counts per department (groupby sorts keys alphabetically)
rdn = review.groupby('Department Name').size()
print(rdn)
Department Name Bottoms 3184 Dresses 5371 Intimate 1408 Jackets 879 Tops 8713 Trend 107 dtype: int64
# BUG FIX: labels must follow the alphabetical groupby order of `rdn`, not
# `unique()` (order of first appearance in the data) — otherwise the pie
# slices are mislabeled.  The Series' own index is guaranteed to match.
rdn_labels = rdn.index
plt.pie(rdn, labels=rdn_labels, autopct='%.1f%%')
plt.title('Number of Different Department')
plt.legend()
plt.show()
print(review.groupby('Class Name').size())  # review counts per clothing class
Class Name Blouses 2587 Casual bottoms 1 Chemises 1 Dresses 5371 Fine gauge 927 Intimates 120 Jackets 598 Jeans 970 Knits 3981 Layering 115 Legwear 131 Lounge 574 Outerwear 281 Pants 1157 Shorts 260 Skirts 796 Sleep 174 Sweaters 1218 Swim 293 Trend 107 dtype: int64
# Bar chart of review counts per clothing class (rotated labels: 20 classes)
sns.countplot(x = review['Class Name'])
plt.xticks(rotation=90)
plt.title('Number of Different Classes')
plt.show()
# Review counts per star rating, index sorted 1..5
rr = review.groupby('Rating').size()
print(rr)
Rating 1 691 2 1360 3 2464 4 4289 5 10858 dtype: int64
# BUG FIX: `review['Rating'].unique()` returns ratings in order of first
# appearance (e.g. [4, 5, 3, ...]), while the slices in `rr` are in sorted
# index order (1..5) — so the labels were attached to the wrong slices.
# Using the Series' own index keeps labels and values aligned.
rr_labels = rr.index
plt.pie(rr, labels=rr_labels, autopct='%.1f%%')
plt.title('Number of Rating from 1 to 5')
plt.legend()
plt.show()
print(review.groupby('Positive Feedback Count').size())  # how many reviews got each feedback count
Positive Feedback Count
0 8930
1 3502
2 1923
3 1258
4 803
...
95 1
98 1
99 1
108 1
122 1
Length: 79, dtype: int64
# Widen the figure: there are ~79 distinct feedback counts on the x-axis
sns.set(rc = {'figure.figsize':(18,5)})
sns.countplot(x = review['Positive Feedback Count'], label="Number of Visitors")
plt.xticks(rotation=90)
plt.title('Number of Positive Feedbacks')
plt.show()
# Grouped bar chart: rating distribution split by recommendation flag,
# with each bar annotated with its exact count.
g = sns.catplot(data = review, x ="Rating", hue = "Recommended IND", kind='count', height=7, aspect=2.5, legend_out=False)
plt.title('Rating Distribution By Recommendation', fontsize=26)
plt.xlabel("Rating", fontsize=20)
plt.ylabel("Number of Recommendations", fontsize=20)
# BUG FIX: legend labels were misspelled 'Recomnended'
plt.legend(title='Recommendation Indicator', loc='upper left', labels=['Not Recommended', 'Recommended'], fontsize='x-large', title_fontsize='24')
ax = g.facet_axis(0, 0)
for p in ax.patches:
    # Place each bar's height (count) just above the bar
    ax.text(p.get_x() + 0.12,
            p.get_height() * 1.025,
            '{0:.0f}'.format(p.get_height()),
            color='black', rotation='horizontal', size='large')
plt.show()
Data Preprocessing
review['Title'].loc[510]  # sample title before preprocessing
'Gorgeous, flattering blouse!'
review['Review Text'].loc[51]  # sample review text before preprocessing
'I absolutely love this bib tee! it\'s probably my favorite retailer purchase of all time. i\'m 5\'7", 140 pounds and the small was a perfect fit for me. i typically wear either a s or m tops.'
# Whitespace-split word frequencies over all (unprocessed) review texts
words = review['Review Text'].str.split(expand=True).stack().value_counts()
words200 = words[:200]
# values=0: the single counts column of the Series once plotly frames it
fig = px.treemap(words200, path=[words200.index], values=0, width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Words in Review Text (Before Processing)')
fig.update_traces(textinfo="label+value")
fig.show()
# Removing Special Characters in Sentence
# ^ (Do Not match) \w (alphanumeric characters) and \s (white space and tab)
def removing_special_character(text):
    """Return *text* with everything except word characters and whitespace removed."""
    # raw string fixes the invalid-escape DeprecationWarning for '\w' in a plain string
    new_text = re.sub(r'[^\w\s]', '', text)
    return new_text
# Lower-case the sentence, then split it into word tokens with NLTK.
def tokenize_sentence(text):
    lowered = text.lower()
    return nltk.word_tokenize(lowered)
# Keep only purely alphabetic tokens, dropping numbers and mixed tokens.
def removing_numbers(text):
    return list(filter(str.isalpha, text))
# Removing Stopwords from text sentence
def removing_stopwords(text):
    """Drop English stopwords from a token list."""
    # set() gives O(1) membership tests instead of scanning a ~180-word list per token
    stopwords = set(nltk.corpus.stopwords.words("english"))
    new_text = [x for x in text if x not in stopwords]
    return new_text
# Lemmatizing Sentence
def lemmatizer(text):
    """Lemmatize every token with WordNet."""
    # Build the lemmatizer once — the original constructed a new
    # WordNetLemmatizer for every single token.
    wnl = WordNetLemmatizer()
    new_text = [wnl.lemmatize(x) for x in text]
    return new_text
# Re-assemble a token list into a single space-separated string.
def join_token(text):
    separator = " "
    return separator.join(text)
# Cleaning pipeline for Title:
# strip punctuation -> lowercase + tokenize -> drop non-alphabetic tokens
# -> drop stopwords -> lemmatize -> rejoin into one string
review['Title'] = review['Title'].apply(removing_special_character)
review['Title'] = review['Title'].apply(tokenize_sentence)
review['Title'] = review['Title'].apply(removing_numbers)
review['Title'] = review['Title'].apply(removing_stopwords)
review['Title'] = review['Title'].apply(lemmatizer)
review['Title'] = review['Title'].apply(join_token)
# Same cleaning pipeline applied to Review Text
review['Review Text'] = review['Review Text'].apply(removing_special_character)
review['Review Text'] = review['Review Text'].apply(tokenize_sentence)
review['Review Text'] = review['Review Text'].apply(removing_numbers)
review['Review Text'] = review['Review Text'].apply(removing_stopwords)
review['Review Text'] = review['Review Text'].apply(lemmatizer)
review['Review Text'] = review['Review Text'].apply(join_token)
review['Title'].loc[510]  # same sample title after preprocessing
'gorgeous flattering blouse'
review['Review Text'].loc[51]  # same sample review text after preprocessing
'absolutely love bib tee probably favorite retailer purchase time im pound small perfect fit typically wear either top'
# Word frequencies again, now over the cleaned review texts
words = review['Review Text'].str.split(expand=True).stack().value_counts()
words200 = words[:200]
fig = px.treemap(words200, path=[words200.index], values=0, width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Words in Review Text (After Processing)')
fig.update_traces(textinfo="label+value")
fig.show()
# Top words among recommended (Recommended IND == 1) reviews only
positive = review[review['Recommended IND']==1].copy()
positive_words = positive['Review Text'].str.split(expand=True).stack().value_counts()
words200 = positive_words[:200]
fig = px.treemap(words200, path=[words200.index], values=0, width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Positive Words in Review Text (After Processing)')
fig.update_traces(textinfo="label+value")
fig.show()
# Top words among not-recommended (Recommended IND == 0) reviews only
negative = review[review['Recommended IND']==0].copy()
negative_words = negative['Review Text'].str.split(expand=True).stack().value_counts()
words200 = negative_words[:200]
fig = px.treemap(words200, path=[words200.index], values=0, width=900, height=900)
fig.update_layout(title_text='Top Frequent 200 Negative Words in Review Text (After Processing)')
fig.update_traces(textinfo="label+value")
fig.show()
1. Predicting using only Review Text
# Model 1: predict Recommended IND from the processed Review Text
X1 = review["Review Text"]
Y1 = review["Recommended IND"]
# splitting test train data
# NOTE(review): no random_state is set, so the split (and reported metrics) are not reproducible
X1_train, X1_test, Y1_train, Y1_test = train_test_split(X1, Y1, test_size=0.2)
print('Training Set:\tX1_train: ', X1_train.shape, ', Y1_train: ', Y1_train.shape,
'\nTesting Set:\tX1_test: ', X1_test.shape, ', Y1_test: ', Y1_test.shape)
Training Set: X1_train: (15729,) , Y1_train: (15729,) Testing Set: X1_test: (3933,) , Y1_test: (3933,)
# Bag-of-words features: fit the vocabulary on the training split only,
# then transform the test split with the same vocabulary
vectorizer = CountVectorizer()
X1_train = vectorizer.fit_transform(X1_train)
X1_test = vectorizer.transform(X1_test)
nb = MultinomialNB()
nb.fit(X1_train, Y1_train)
nb_predict1 = nb.predict(X1_test)
print('Accuracy of Naive Bayes: {:.2f}'.format(nb.score(X1_test, Y1_test)))
Accuracy of Naive Bayes: 0.89
# Confusion Matrix of Naive Bayes using testset output and predicted output
print('Confusion Matrix:')
print(confusion_matrix(Y1_test, nb_predict1))
# Classification Report of Naive Bayes using testset output and predicted output
print('\nClassification Report:')
print(classification_report(Y1_test, nb_predict1))
Confusion Matrix:
[[ 415 305]
[ 143 3070]]
Classification Report:
precision recall f1-score support
0 0.74 0.58 0.65 720
1 0.91 0.96 0.93 3213
accuracy 0.89 3933
macro avg 0.83 0.77 0.79 3933
weighted avg 0.88 0.89 0.88 3933
2. Predicting using only Review Text where Positive Feedback Count is greater than 1
# Model 2: same text features, restricted to reviews with >1 positive feedback
r2 = review.loc[review['Positive Feedback Count'] > 1]
X2 = r2['Review Text']
Y2 = r2['Recommended IND']
# splitting test train data
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.2)
print('Training Set:\tX2_train: ', X2_train.shape, ', Y2_train: ', Y2_train.shape,
'\nTesting Set:\tX2_test: ', X2_test.shape, ', Y2_test: ', Y2_test.shape)
Training Set: X2_train: (5784,) , Y2_train: (5784,) Testing Set: X2_test: (1446,) , Y2_test: (1446,)
# Fresh vectorizer (vocabulary from this subset's training split only)
vectorizer = CountVectorizer()
X2_train = vectorizer.fit_transform(X2_train)
X2_test = vectorizer.transform(X2_test)
nb = MultinomialNB()
nb.fit(X2_train, Y2_train)
nb_predict2 = nb.predict(X2_test)
print('Accuracy of Naive Bayes: {:.2f}'.format(nb.score(X2_test, Y2_test)))
Accuracy of Naive Bayes: 0.87
# Confusion Matrix of Naive Bayes using testset output and predicted output
print('Confusion Matrix:')
print(confusion_matrix(Y2_test, nb_predict2))
# Classification Report of Naive Bayes using testset output and predicted output
print('\nClassification Report:')
print(classification_report(Y2_test, nb_predict2))
Confusion Matrix:
[[ 158 144]
[ 44 1100]]
Classification Report:
precision recall f1-score support
0 0.78 0.52 0.63 302
1 0.88 0.96 0.92 1144
accuracy 0.87 1446
macro avg 0.83 0.74 0.77 1446
weighted avg 0.86 0.87 0.86 1446
3. Predicting using only Title
# Model 3: predict Recommended IND from the processed Title only
X3 = review['Title']
Y3 = review['Recommended IND']
# splitting test train data
X3_train, X3_test, Y3_train, Y3_test = train_test_split(X3, Y3, test_size=0.2)
print('Training Set:\tX3_train: ', X3_train.shape, ', Y3_train: ', Y3_train.shape,
'\nTesting Set:\tX3_test: ', X3_test.shape, ', Y3_test: ', Y3_test.shape)
Training Set: X3_train: (15729,) , Y3_train: (15729,) Testing Set: X3_test: (3933,) , Y3_test: (3933,)
# Bag-of-words over titles; vocabulary fitted on training split only
vectorizer = CountVectorizer()
X3_train = vectorizer.fit_transform(X3_train)
X3_test = vectorizer.transform(X3_test)
nb = MultinomialNB()
nb.fit(X3_train, Y3_train)
nb_predict3 = nb.predict(X3_test)
print('Accuracy of Naive Bayes: {:.2f}'.format(nb.score(X3_test, Y3_test)))
Accuracy of Naive Bayes: 0.87
# Confusion Matrix of Naive Bayes using testset output and predicted output
print('Confusion Matrix:')
print(confusion_matrix(Y3_test, nb_predict3))
# Classification Report of Naive Bayes using testset output and predicted output
print('\nClassification Report:')
print(classification_report(Y3_test, nb_predict3))
Confusion Matrix:
[[ 362 347]
[ 162 3062]]
Classification Report:
precision recall f1-score support
0 0.69 0.51 0.59 709
1 0.90 0.95 0.92 3224
accuracy 0.87 3933
macro avg 0.79 0.73 0.76 3933
weighted avg 0.86 0.87 0.86 3933
4. Predicting using only Title where Positive Feedback Count is greater than 1
# Model 4: Title features, restricted to reviews with >1 positive feedback
r4 = review.loc[review['Positive Feedback Count'] > 1]
X4 = r4['Title']
Y4 = r4['Recommended IND']
# splitting test train data
X4_train, X4_test, Y4_train, Y4_test = train_test_split(X4, Y4, test_size=0.2)
print('Training Set:\tX4_train: ', X4_train.shape, ', Y4_train: ', Y4_train.shape,
'\nTesting Set:\tX4_test: ', X4_test.shape, ', Y4_test: ', Y4_test.shape)
Training Set: X4_train: (5784,) , Y4_train: (5784,) Testing Set: X4_test: (1446,) , Y4_test: (1446,)
# Fresh vectorizer for the title subset
vectorizer = CountVectorizer()
X4_train = vectorizer.fit_transform(X4_train)
X4_test = vectorizer.transform(X4_test)
nb = MultinomialNB()
nb.fit(X4_train, Y4_train)
nb_predict4 = nb.predict(X4_test)
print('Accuracy of Naive Bayes: {:.2f}'.format(nb.score(X4_test, Y4_test)))
Accuracy of Naive Bayes: 0.83
# Confusion Matrix of Naive Bayes using testset output and predicted output
print('Confusion Matrix:')
print(confusion_matrix(Y4_test, nb_predict4))
# Classification Report of Naive Bayes using testset output and predicted output
print('\nClassification Report:')
print(classification_report(Y4_test, nb_predict4))
Confusion Matrix:
[[ 126 187]
[ 52 1081]]
Classification Report:
precision recall f1-score support
0 0.71 0.40 0.51 313
1 0.85 0.95 0.90 1133
accuracy 0.83 1446
macro avg 0.78 0.68 0.71 1446
weighted avg 0.82 0.83 0.82 1446
5. Predicting using Rating and Positive Feedback Count
# Model 5: predict Recommended IND from the numeric Rating and
# Positive Feedback Count columns (no text features)
X5 = review[['Rating', 'Positive Feedback Count']]
Y5 = review['Recommended IND']
# splitting test train data
X5_train, X5_test, Y5_train, Y5_test = train_test_split(X5, Y5, test_size=0.2)
print('Training Set:\tX5_train: ', X5_train.shape, ', Y5_train: ', Y5_train.shape,
'\nTesting Set:\tX5_test: ', X5_test.shape, ', Y5_test: ', Y5_test.shape)
Training Set: X5_train: (15729, 2) , Y5_train: (15729,) Testing Set: X5_test: (3933, 2) , Y5_test: (3933,)
# No vectorizer needed: the features are already non-negative integers,
# which MultinomialNB accepts directly
# vectorizer = CountVectorizer()
# X5_train = vectorizer.fit_transform(X5_train)
# X5_test = vectorizer.transform(X5_test)
nb = MultinomialNB()
nb.fit(X5_train, Y5_train)
nb_predict5 = nb.predict(X5_test)
print('Accuracy of Naive Bayes: {:.2f}'.format(nb.score(X5_test, Y5_test)))
Accuracy of Naive Bayes: 0.76
# Confusion Matrix of Naive Bayes using testset output and predicted output
print('Confusion Matrix:')
# BUG FIX: this section evaluated model 4's data (Y4_test / nb_predict4),
# so the printed matrix and report were copies of model 4's results,
# contradicting the 0.76 accuracy printed above.  Use model 5's outputs.
print(confusion_matrix(Y5_test, nb_predict5))
# Classification Report of Naive Bayes using testset output and predicted output
print('\nClassification Report:')
print(classification_report(Y5_test, nb_predict5))
Confusion Matrix:
[[ 126 187]
[ 52 1081]]
Classification Report:
precision recall f1-score support
0 0.71 0.40 0.51 313
1 0.85 0.95 0.90 1133
accuracy 0.83 1446
macro avg 0.78 0.68 0.71 1446
weighted avg 0.82 0.83 0.82 1446
Predicting using only Review Text
Predicting using only Review Text where Positive Feedback Count is greater than 1
Predicting using only Title
Predicting using only Title where Positive Feedback Count is greater than 1
Predicting using Rating and Positive Feedback Count